Slid(er)ing Into Batters’ DMs:

How and Why Usage of the Slider Has Changed Across Major League Baseball

Custom Theme

First I need to set a custom theme to make my plots look pretty…

custom_theme <- function() { 
  font <- "Helvetica" # font selection
    
    theme_minimal() %+replace% # theme based on minimal with following replacements
    
    theme(panel.grid.major = element_blank(), # leave grids and axis ticks blank
          panel.grid.minor = element_blank(),
          axis.ticks = element_blank(),
          axis.line = element_line(color = "black",
                                   size = 1),
          panel.border = element_rect(color = "black",
                                      fill=NA,
                                      size=1),
          plot.title = element_text(family = font,
                                    size = 20,
                                    face = 'bold',
                                    hjust = 0.5, # move title to center horizontally
                                    vjust = 2), # move title up a wee bit
          plot.subtitle = element_text(family = font,
                                       size = 15,
                                       hjust = 0.5),
          plot.caption = element_text(family = font,
                                      size = 10,
                                      hjust = 1), # put caption in right corner
          axis.title = element_text(family = font,
                                    face = 'italic',
                                    size = 15),
          axis.text = element_text(family = font,
                                   size = 10),
          axis.text.x = element_text(margin = margin(t = 2, # top
                                                     r = 2, # right
                                                     b = 2, # bottom
                                                     l = 2))) # left
}

Data Aggregation

I need to aggregate quite a bit of data for this research project. All data was obtained from Baseball Savant (link).

Pitch Usage Data Aggregation

I will first want to look at trends in pitch usage over time. These datasets are also going to include statistics such as spin_rate, release_extension, launch_speed, etc. that will be interesting to look at to hypothesize about the factors that influence pitch outcome.

library(dplyr) # data manip
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table) # creating dfs
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
# function to compile .csv files per pitch and organize
pitch_usage_aggregate_function <- function(pitch_usage_files_path) {
  
pitch_usage_aggregate <- rbindlist(mapply(c,(list.files(path = pitch_usage_files_path,
                                                  pattern = "*.csv",
                                                  full.names = TRUE) %>%
                                               lapply(read.table,
                                                      header = TRUE,
                                                      sep = ",",
                                                      encoding = "UTF-8")),
                                          (list.files(path = pitch_usage_files_path,
                                                      pattern = "*.csv",
                                                      full.names = TRUE) %>%
                                             basename() %>%
                                             as.list()),
                                          SIMPLIFY = FALSE),
                                   fill = T) %>% 
  mutate(year = substr(V1, 1, 4)) %>% # pull year from col V1
  mutate(pitch = substr(V1, 6, 9)) %>% # pull pitch abr from col V1
  mutate(pitch = recode(pitch, Chan = 'Change', Cutt = 'Cutter', Curv =  'Curveball',
                        Fast = 'Fastball', Sink = 'Sinker', Slid = 'Slider')) %>% 
  select(-c(pos3_int_start_distance, pos4_int_start_distance,
            pos5_int_start_distance, pos6_int_start_distance, 
            pos7_int_start_distance, pos8_int_start_distance, 
            pos9_int_start_distance, V1)) %>% # remove select cols
  select(player_name, player_id, year:pitch, pitches:pitch_percent, spin_rate:velocity,
         release_extension, effective_speed, eff_min_vel, ba:launch_angle,
         whiffs:takes) # reorder cols
  
return(as.data.frame(pitch_usage_aggregate))
}

# set paths to pitch files
changeup_usage_path <- "Data/Pitch Usage/Change"
curveball_usage_path <- "Data/Pitch Usage/Curveball"
cutter_usage_path <- "Data/Pitch Usage/Cutter"
fastball_usage_path <- "Data/Pitch Usage/Fastball"
sinker_usage_path <- "Data/Pitch Usage/Sinker"
slider_usage_path <- "Data/Pitch Usage/Slider"

# aggregate pitch files into dfs

changeup_usage_aggregate <- pitch_usage_aggregate_function(changeup_usage_path)
curveball_usage_aggregate <- pitch_usage_aggregate_function(curveball_usage_path)
cutter_usage_aggregate <- pitch_usage_aggregate_function(cutter_usage_path)
fastball_usage_aggregate <- pitch_usage_aggregate_function(fastball_usage_path)
sinker_usage_aggregate <- pitch_usage_aggregate_function(sinker_usage_path)
slider_usage_aggregate <- pitch_usage_aggregate_function(slider_usage_path)

# list of all aggregate dfs
master_pitch_usage_aggregate_list <- list(changeup_usage_aggregate = changeup_usage_aggregate,
                                         curveball_usage_aggregate = curveball_usage_aggregate,
                                         cutter_usage_aggregate = cutter_usage_aggregate,
                                         fastball_usage_aggregate = fastball_usage_aggregate,
                                         sinker_usage_aggregate = sinker_usage_aggregate,
                                         slider_usage_aggregate = slider_usage_aggregate)

# write aggregate dfs into .csv files
mapply(write.csv, 
       master_pitch_usage_aggregate_list, 
       file=paste0(names(master_pitch_usage_aggregate_list),
                   '.csv'))
## $changeup_usage_aggregate
## NULL
## 
## $curveball_usage_aggregate
## NULL
## 
## $cutter_usage_aggregate
## NULL
## 
## $fastball_usage_aggregate
## NULL
## 
## $sinker_usage_aggregate
## NULL
## 
## $slider_usage_aggregate
## NULL
# combine aggregate dfs into one master df
master_pitch_usage_aggregate <- rbind(changeup_usage_aggregate,
                                      curveball_usage_aggregate,
                                      cutter_usage_aggregate,
                                      fastball_usage_aggregate,
                                      sinker_usage_aggregate,
                                      slider_usage_aggregate)

# write master df to .csv file
write.csv(master_pitch_usage_aggregate,
          "Data/Pitch Usage/Aggregated Files/master_pitch_usage_aggregate.csv",
          row.names = TRUE)

Pitch Movement Data Aggregation

I’l

pitch_move_aggregate_function <- function(pitch_move_files_path) {
  
pitch_move_aggregate <- rbindlist(mapply(c,(list.files(path = pitch_move_files_path,
                                                       pattern = "*.csv",
                                                       full.names = TRUE) %>%
                                              lapply(read.table,
                                                     header = TRUE,
                                                     sep = ",",
                                                     encoding = "UTF-8")),
                                         (list.files(path = pitch_move_files_path,
                                                     pattern = "*.csv",
                                                     full.names = TRUE) %>%
                                            basename() %>%
                                            as.list()),
                                         SIMPLIFY = FALSE),
                                  fill = T) %>%
  mutate(pitch = recode(pitch_type_name, Changeup = 'Change', '4-Seamer' = "Fastball")) %>%
  mutate(player_id = pitcher_id,
         horizontal_break = pitcher_break_x,
         vertical_break = pitcher_break_z) %>% 
  select(-V1) %>% # remove select cols
  select(year, player_id, last_name, first_name, pitch_hand, 
         total_pitches, pitch, pitches_thrown,
         avg_speed, rise, tail, horizontal_break, vertical_break) # reorder cols

return(as.data.frame(pitch_move_aggregate))
}

# set paths to pitch files
all_move_path <- "Data/Movement/All Pitches"

# aggregate movement files into dfs

all_move_aggregate <- pitch_move_aggregate_function(all_move_path)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## number of items read is not a multiple of the number of columns
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## number of items read is not a multiple of the number of columns
# write master df to .csv file
write.csv(all_move_aggregate,
          "Data/Movement/All Pitches/master_pitch_movement_aggregate.csv",
          row.names = TRUE)

Slider Location Data Aggregation

slider_location_aggregate_function <- function(slider_location_files_path) {
  
slider_location_aggregate <- rbindlist(mapply(c,(list.files(path = slider_location_files_path,
                                                            pattern = "*.csv",
                                                            full.names = TRUE) %>%
                                                   lapply(read.table,
                                                          header = TRUE,
                                                          sep = ",",
                                                          encoding = "UTF-8")),
                                              (list.files(path = slider_location_files_path,
                                                          pattern = "*.csv",
                                                          full.names = TRUE) %>%
                                                 basename() %>%
                                                 as.list()),
                                              SIMPLIFY = FALSE),
                                       fill = T) %>%
  mutate(year = substr(V1, 1, 4)) %>% # pull year from col V1
  mutate(pitch = substr(V1, 14, 19)) %>% # pull pitch abr from col V1
  mutate(zone = substr(V1, 11, 12)) %>% # pull zone from col V1
  select(-c(pos3_int_start_distance, pos4_int_start_distance,
            pos5_int_start_distance, pos6_int_start_distance, 
            pos7_int_start_distance, pos8_int_start_distance, 
            pos9_int_start_distance, V1)) %>% # remove select cols
  select(player_name, player_id, year:pitch, zone, pitches:pitch_percent, spin_rate:velocity,
         release_extension, effective_speed, eff_min_vel, ba:launch_angle,
         whiffs:takes) # reorder cols

return(as.data.frame(slider_location_aggregate))
}

# set paths to pitch files
slider_location_all_path <- "Data/Location/Slider/All"

# aggregate movement files into dfs

slider_location_all_aggregate <- slider_location_aggregate_function(slider_location_all_path)

# write master df to .csv file
write.csv(slider_location_all_aggregate,
          "Data/Location/Slider/All/Aggregate Files/master_slider_location_all_aggregate.csv",
          row.names = TRUE)

Pitch Usage Analysis

Statistics

library(tidyr)

# aggregate all pitch usage stats

master_pitch_usage_summary <- master_pitch_usage_aggregate %>% 
  group_by(year, pitch) %>% 
  filter(total_pitches > 100) %>%
  select(-c(player_name, total_pitches, pitch_percent, ba)) %>%
  summarise(pitch_count = sum(pitches),
            sum_whiffs = sum(whiffs),
            sum_swings = sum(swings),
            sum_takes = sum(takes),
            sum_abs = sum(abs),
            sum_hits = sum(hits),
            mean_spin_rate = weighted.mean(spin_rate, pitches),
            mean_velocity = weighted.mean(velocity, pitches),
            mean_release_extension = weighted.mean(release_extension, pitches),
            mean_effective_speed = weighted.mean(effective_speed, pitches),
            mean_eff_min_vel = weighted.mean(eff_min_vel, pitches),
            mean_iso = weighted.mean(iso, pitches),
            mean_babip = weighted.mean(babip, pitches),
            mean_slg = weighted.mean(slg, pitches),
            mean_woba = weighted.mean(woba, pitches),
            mean_xwoba = weighted.mean(xwoba, pitches),
            mean_xba = weighted.mean(xba, pitches),
            mean_launch_speed = weighted.mean(launch_speed, pitches),
            mean_launch_angle = weighted.mean(launch_angle, pitches)) %>% 
  mutate(ba = sum_hits / sum_abs,
         swing_percentage = sum_swings / pitch_count,
         whiff_percentage = sum_whiffs / sum_swings,
         contact_rate = (sum_swings - sum_whiffs) / sum_swings,
         take_percentage = sum_takes / pitch_count)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# save and export
write.csv(master_pitch_usage_summary,
          "Data/Pitch Usage/Statistics/master_pitch_usage_summary.csv",
          row.names = TRUE)

# calculate total number of pitches per year
total_pitches <- master_pitch_usage_aggregate %>% 
  group_by(year, pitch) %>%
  #filter(year > 2014) %>% 
  summarise(pitch_count = sum(pitches)) %>% 
  ungroup() %>% 
  group_by(year) %>% 
  summarise(total_pitches = sum(pitch_count)) %>% 
  slice(rep(1:n(), each = 6)) %>% # rep 6 times to fit future need (see pitch_percentages)
  as.data.frame()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# calculate pitch usage %s

library(tibble) # add_col fun

master_pitch_usage_percentages <- master_pitch_usage_aggregate %>% 
  group_by(year, pitch) %>% 
  filter(total_pitches > 100) %>%
  summarise(pitch_count = sum(pitches)) %>% 
  select(year, pitch, pitch_count) %>% 
  ungroup() %>%
  add_column(all_pitches = total_pitches$total_pitches) %>% 
  mutate(pitch_usage = pitch_count/all_pitches)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# write the .csv
write.csv(master_pitch_usage_percentages,
          "Data/Pitch Usage/Statistics/master_pitch_usage_percentages.csv",
          row.names = TRUE)

# calc statcast era statistics - for use in full analyses

master_statcast_summary <- master_pitch_usage_aggregate %>% 
  group_by(year, pitch) %>% 
  filter(total_pitches > 100) %>%
  select(-c(player_name, total_pitches, pitch_percent, ba)) %>% 
  drop_na() %>% 
  summarise(pitch_count = sum(pitches),
            sum_whiffs = sum(whiffs),
            sum_swings = sum(swings),
            sum_takes = sum(takes),
            sum_abs = sum(abs),
            sum_hits = sum(hits),
            mean_spin_rate = weighted.mean(spin_rate, pitches),
            mean_velocity = weighted.mean(velocity, pitches),
            mean_release_extension = weighted.mean(release_extension, pitches),
            mean_effective_speed = weighted.mean(effective_speed, pitches),
            mean_eff_min_vel = weighted.mean(eff_min_vel, pitches),
            mean_iso = weighted.mean(iso, pitches),
            mean_babip = weighted.mean(babip, pitches),
            mean_slg = weighted.mean(slg, pitches),
            mean_woba = weighted.mean(woba, pitches),
            mean_xwoba = weighted.mean(xwoba, pitches),
            mean_xba = weighted.mean(xba, pitches),
            mean_launch_speed = weighted.mean(launch_speed, pitches),
            mean_launch_angle = weighted.mean(launch_angle, pitches)) %>% 
  mutate(ba = sum_hits / sum_abs,
         swing_percentage = sum_swings / pitch_count,
         whiff_percentage = sum_whiffs / sum_swings,
         contact_rate = (sum_swings - sum_whiffs) / sum_swings,
         take_percentage = sum_takes / pitch_count)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# calc stats for all recorded data

master_allyears_summary <- master_pitch_usage_aggregate %>% 
  group_by(year, pitch) %>% 
  filter(total_pitches > 100) %>%
  select(-c(player_name, total_pitches, pitch_percent, ba,
            spin_rate, release_extension, effective_speed, eff_min_vel,
            xwoba, xba, launch_speed, launch_angle)) %>% 
  drop_na() %>% 
  summarise(pitch_count = sum(pitches),
            sum_whiffs = sum(whiffs),
            sum_swings = sum(swings),
            sum_takes = sum(takes),
            sum_abs = sum(abs),
            sum_hits = sum(hits),
            mean_velocity = weighted.mean(velocity, pitches),
            mean_iso = weighted.mean(iso, pitches),
            mean_babip = weighted.mean(babip, pitches),
            mean_slg = weighted.mean(slg, pitches),
            mean_woba = weighted.mean(woba, pitches)) %>% 
  mutate(ba = sum_hits / sum_abs,
         swing_percentage = sum_swings / pitch_count,
         whiff_percentage = sum_whiffs / sum_swings,
         contact_rate = (sum_swings - sum_whiffs) / sum_swings,
         take_percentage = sum_takes / pitch_count)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# write .csv
write.csv(master_allyears_summary,
          "Data/Pitch Usage/Statistics/master_allyears_summary.csv",
          row.names = TRUE)

str(master_allyears_summary)
## gropd_df [90 × 18] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ year            : chr [1:90] "2008" "2008" "2008" "2008" ...
##  $ pitch           : chr [1:90] "Change" "Curveball" "Cutter" "Fastball" ...
##  $ pitch_count     : int [1:90] 67620 54907 34269 239842 167714 98813 68620 57641 35079 250581 ...
##  $ sum_whiffs      : int [1:90] 9791 5863 3290 17558 9089 14450 9674 6241 3555 18243 ...
##  $ sum_swings      : int [1:90] 34194 21487 16597 109023 74686 47548 34321 22545 17269 112422 ...
##  $ sum_takes       : int [1:90] 33424 33420 17670 130819 93028 51265 34298 35096 17809 138159 ...
##  $ sum_abs         : int [1:90] 17534 12747 8283 52163 39843 24992 17660 13456 8538 53789 ...
##  $ sum_hits        : int [1:90] 4521 2773 2131 14473 11678 5591 4446 2869 2137 14842 ...
##  $ mean_velocity   : num [1:90] 82.3 76.3 87.4 91.9 90.7 ...
##  $ mean_iso        : num [1:90] 0.158 0.117 0.149 0.174 0.153 ...
##  $ mean_babip      : num [1:90] 0.293 0.29 0.289 0.303 0.307 ...
##  $ mean_slg        : num [1:90] 0.418 0.335 0.407 0.451 0.446 ...
##  $ mean_woba       : num [1:90] 0.314 0.26 0.317 0.358 0.358 ...
##  $ ba              : num [1:90] 0.258 0.218 0.257 0.277 0.293 ...
##  $ swing_percentage: num [1:90] 0.506 0.391 0.484 0.455 0.445 ...
##  $ whiff_percentage: num [1:90] 0.286 0.273 0.198 0.161 0.122 ...
##  $ contact_rate    : num [1:90] 0.714 0.727 0.802 0.839 0.878 ...
##  $ take_percentage : num [1:90] 0.494 0.609 0.516 0.545 0.555 ...
##  - attr(*, "groups")= tibble [15 × 2] (S3: tbl_df/tbl/data.frame)
##   ..$ year : chr [1:15] "2008" "2009" "2010" "2011" ...
##   ..$ .rows: list<int> [1:15] 
##   .. ..$ : int [1:6] 1 2 3 4 5 6
##   .. ..$ : int [1:6] 7 8 9 10 11 12
##   .. ..$ : int [1:6] 13 14 15 16 17 18
##   .. ..$ : int [1:6] 19 20 21 22 23 24
##   .. ..$ : int [1:6] 25 26 27 28 29 30
##   .. ..$ : int [1:6] 31 32 33 34 35 36
##   .. ..$ : int [1:6] 37 38 39 40 41 42
##   .. ..$ : int [1:6] 43 44 45 46 47 48
##   .. ..$ : int [1:6] 49 50 51 52 53 54
##   .. ..$ : int [1:6] 55 56 57 58 59 60
##   .. ..$ : int [1:6] 61 62 63 64 65 66
##   .. ..$ : int [1:6] 67 68 69 70 71 72
##   .. ..$ : int [1:6] 73 74 75 76 77 78
##   .. ..$ : int [1:6] 79 80 81 82 83 84
##   .. ..$ : int [1:6] 85 86 87 88 89 90
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE

Basic visualizations

Pitch Outcomes

Swings & Contact

How often do batters swing at each pitch? I hypothesize that batters swing at sliders more often than other pitches because they look similar to a fastball. I also hypothesize that batters miss (i.e., lower contact rate) against sliders more often than other pitches because of the deceptive breaking movement from the pitcher’s hand.

library(ggpubr)

swing_plot <- ggplot(data = master_allyears_summary,
                     mapping = aes(x = year,
                                   y = swing_percentage,
                                   group = pitch)) +
  geom_line(aes(color = pitch)) +
  geom_point(aes(color = pitch)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 8)) +
  labs(title = "Swing Rate by Pitch",
       y = "Swing Rate",
       x = "Season",
       color = "Pitch")

swing_plot

contact_plot <- ggplot(data = master_allyears_summary,
                       mapping = aes(x = year,
                                     y = contact_rate,
                                     group = pitch)) +
  geom_line(aes(color = pitch)) +
  geom_point(aes(color = pitch)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 8)) +
  labs(title = "Contact Rate by Pitch",
       y = "Contact Rate",
       x = "Season",
       color = "Pitch")

contact_plot

swing_contact_plot <- ggarrange(swing_plot,
                                contact_plot,
                                ncol = 2,
                                nrow = 1,
                                common.legend = TRUE,
                                legend = "right")

swing_contact_plot

Hmm. So batter’s swing at sliders, cutters, and changeups the most. These are all anecdotally the pitches that look the most like fastballs. Cutters and sliders break away from the pitcher’s hand and are similar in speed to fastballs, while changeups drop and fade away from the pitcher’s hand and are slower.

More swings is only good if it leads to a favorable outcome for the pitcher. The best swing outcome is a miss because any time a ball is put in play the batter has a chance to get on base. Interestingly, contact rates for sliders are the lowest of any pitch. So this means that batters swing at sliders more often than many other pitches but miss with their swings more often than any other pitch. If your goal is to maximize swings and misses, than throwing the slider more often seems to be the best approach.

Contact quality

But what happens when a batter does make contact? If the contact that does occur is good, the benefit of missed swings against sliders might be canceled by the risk of more hits.

library(ggplot2)

ba_plot <- ggplot(data = master_statcast_summary,
                  mapping = aes(x = year,
                                y = ba,
                                group = pitch)) +
  geom_line(aes(color = pitch)) +
  geom_point(aes(color = pitch)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(title = "Batting Avg. by Pitch",
       y = "Batting Average",
       x = "Season",
       color = "Pitch")

ba_plot

xba_plot <- ggplot(data = master_statcast_summary,
                  mapping = aes(x = year,
                                y = mean_xba,
                                group = pitch)) +
  geom_line(aes(color = pitch)) +
  geom_point(aes(color = pitch)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(y = "Expected Batting Average",
       x = "Season",
       color = "Pitch")

xba_plot

babip_plot <- ggplot(data = master_statcast_summary,
                     mapping = aes(x = year,
                                   y = mean_babip,
                                   group = pitch)) +
  geom_line(aes(color = pitch)) +
  geom_point(aes(color = pitch)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(y = "BABIP",
       x = "Season",
       color = "Pitch")

babip_plot

batting_averages_plot <- ggarrange(ba_plot,
                                   xba_plot,
                                   babip_plot,
                                   ncol = 3,
                                   nrow = 1,
                                   common.legend = TRUE,
                                   legend = "right")

batting_averages_plot

So batters have the lowest batting average and expected batting average against sliders. These metrics include strikeouts, so when you remove those and look only at batting average on balls in play you see that only the changeup has a lower BABIP.

I can even reverse calculate the strikeout rate against all the pitch types by subtracting the BA from BABIP, since the only outcomes that aren’t included in BABIP are strikeouts (and walks but that’s a separate topic).

strikeout_rate_table <- master_statcast_summary %>% 
  mutate(strikeout_rate = mean_babip - ba) %>% 
  select(year, pitch, strikeout_rate) %>% 
  pivot_wider(names_from = year,
              values_from = strikeout_rate)

strikeout_rate_table <- reactable(strikeout_rate_table,
                                  pagination = FALSE,
                                  defaultColDef = colDef(cell = data_bars(strikeout_rate_table,
                                                                          number_fmt = scales::percent,
                                                                          text_position = "above",
                                                                          round_edges = TRUE,
                                                                          max_value = 0.1,
                                                                          fill_color = "red")))
  
strikeout_rate_table

This aligns with the BA plots - batters strikeout the most against sliders and curveballs. But they swing at sliders much more often, so strikeout totals are likely higher.

Power

When batters make contact, how successful are they? Are they getting singles, doubles, triples, HRs?

slg_plot <- ggplot(data = master_statcast_summary,
                     mapping = aes(x = year,
                                   y = mean_slg,
                                   group = pitch)) +
  geom_point(aes(color = pitch)) +
  geom_line(aes(color = pitch)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(x = "Season",
       y = "Slugging Percentage",
       color = "Pitch")

slg_plot

Batters hit for less power against sliders than any other pitch.

Exit velocity

library(ggplot2)

exit_velo_plot <- ggplot(data = master_statcast_summary,
                         mapping = aes(x = year,
                                       group = pitch,
                                       y = mean_launch_speed)) +
  geom_line(aes(color = pitch)) +
  geom_point(aes(color = pitch)) +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(title = "Exit Velo. By Pitch",
       x = "Season",
       y = "Exit Velocity (mph)",
       color = "Pitch")

exit_velo_plot

BA_exitvelo_plot <- ggarrange(ba_plot,
                              exit_velo_plot,
                              ncol = 2,
                              nrow = 1,
                              common.legend = TRUE,
                              legend = "right")
BA_exitvelo_plot

Similar to slugging. Batters make weak contact against sliders. Only changeups result in lower exit velocity.

What makes a slider effective?

Where are we so far?

So we’ve seen that slider usage has increased substantially over the past 15 years. This rise is likely due to the fact that batters swing at a lot of sliders and miss a lot. This correlates with the high strikeout rates that I reverse calculated. When batters do make contact, it results in fewer hits and fewer extra bases.

Where are we going?

Why do batters suck against the slider? Sliders are often nearly as fast as fastballs but move.

Possibilities: - The spin rate mirror a fastball too, so that it looks like a fastball until it suddenly moves - It is released closer to the plate so that the batter has less time to react compared to other pitches of similar speed - It breaks. Horizontally and vertically. A lot. - It is maybe particularly effective in different parts of the strike zone

Pitch Factor Analysis

Trim down the dataset

master_statcast_aggregate <- master_pitch_usage_aggregate %>% 
  filter(year > 2014,
         total_pitches > 99) %>% 
  mutate(whiff_percentage = whiffs / swings)

Velocity

velocity_whiffs_plot <- ggplot(data = master_statcast_aggregate,
                               mapping = aes(x = velocity,
                                             y = whiff_percentage)) +
  geom_point(aes(color = pitch)) +
  geom_smooth(method = "lm",
              se = FALSE,
              fullrange = TRUE,
              color = "black") +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  facet_wrap(vars(pitch)) +
  labs(title = "Influence of Pitch Velocity on Whiff Likelihood",
       x = "Velocity (mph)",
       y = "Whiffs (%)",
       color = "Pitch")

velocity_whiffs_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 516 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 516 rows containing missing values (`geom_point()`).

Increased velocity is correlated to higher whiff percentages for all pitches - though the positive correlation is VERY small for changeup. It is also not too large fror sliders, but it is positive.

Spin Rate

spin_whiffs_plot <- ggplot(data = master_statcast_aggregate,
                           mapping = aes(x = spin_rate,
                                         y = whiff_percentage)) +
  geom_point(aes(color = pitch)) +
  geom_smooth(method = "lm",
              se = FALSE,
              fullrange = TRUE,
              color = "black") +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  facet_wrap(vars(pitch)) +
  labs(title = "Influence of Spin Rate on Whiff Likelihood",
       x = "Spin Rate (rpm)",
       y = "Whiffs (%)",
       color = "Pitch")

spin_whiffs_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 521 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 521 rows containing missing values (`geom_point()`).

Higher spin rates are associated with higher whiff rates for all pitches. This is likely because spin rate increases movement, but might also be that high spin rates are harder to visually detect by the batter for pitch identification. The slider does not lead the pack in this comparison, but there is an overall positive correlation.

Release extension

release_whiffs_plot <- ggplot(data = master_statcast_aggregate,
                              mapping = aes(x = release_extension,
                                            y = whiff_percentage)) +
  geom_point(aes(color = pitch)) +
  geom_smooth(method = "lm",
              se = FALSE,
              fullrange = TRUE,
              color = "black") +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  facet_wrap(vars(pitch)) +
  labs(title = "Influence of Release Extension on Whiff Likelihood",
       x = "Release Extension (ft)",
       y = "Whiffs (%)",
       color = "Pitch")

release_whiffs_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 518 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 518 rows containing missing values (`geom_point()`).

Release extension is generally positively correlated with whiff percentage. Interestingly, this seems to have the least effect with sliders.

Break

all_move_aggregate <- all_move_aggregate %>% 
  group_by(player_id, year, pitch) %>% 
  mutate(year = as.character(year))

statcast_move_aggregate <- master_statcast_aggregate %>% 
  group_by(player_id, year, pitch) %>% 
  inner_join(all_move_aggregate,
             by = c('player_id' = 'player_id',
                    'year' = 'year',
                    'pitch' = 'pitch')) %>% 
  select(-c(last_name, first_name, total_pitches.y, pitches_thrown, avg_speed)) %>% 
  rename(total_pitches = total_pitches.x) %>% 
  select(player_id, player_name, pitch_hand, year:whiff_percentage, rise:vertical_break)

write.csv(statcast_move_aggregate,
          "Data/Key Aggregates/Statcast and Movement Data.csv",
          row.names = TRUE)
horbreaks_whiffs_plot <- ggplot(data = statcast_move_aggregate,
                                mapping = aes(x = horizontal_break,
                                              y = whiff_percentage)) +
  geom_point(aes(color = pitch)) +
  geom_smooth(method = "lm",
              se = FALSE,
              fullrange = TRUE,
              color = "black") +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  facet_wrap(vars(pitch)) +
  labs(title = "Influence of Horizontal Break on Whiff Likelihood",
       x = "Horizontal Break (in)",
       y = "Whiffs (%)",
       color = "Pitch")

horbreaks_whiffs_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).

Interestingly, the amount of horizontal break isn’t correlated with whiff percentage.

verbreaks_whiffs_plot <- ggplot(data = statcast_move_aggregate,
                                mapping = aes(x = vertical_break,
                                              y = whiff_percentage)) +
  geom_point(aes(color = pitch)) +
  geom_smooth(method = "lm",
              se = FALSE,
              fullrange = TRUE,
              color = "black") +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  facet_wrap(vars(pitch)) +
  labs(title = "Influence of Vertical Break on Whiff Likelihood",
       x = "Vertical Break (in)",
       y = "Whiffs (%)",
       color = "Pitch")

verbreaks_whiffs_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).

Vertical break seems to have a bit more of an effect.

rise_whiffs_plot <- ggplot(data = statcast_move_aggregate,
                           mapping = aes(x = rise,
                                         y = whiff_percentage)) +
  geom_point(aes(color = pitch)) +
  geom_smooth(method = "lm",
              se = FALSE,
              fullrange = TRUE,
              color = "black") +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  facet_wrap(vars(pitch)) +
  labs(title = "Influence of Rise on Whiff Likelihood",
       x = "Rise (in)",
       y = "Whiffs (%)",
       color = "Pitch")

rise_whiffs_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).

Whoa. Rise definitely influences whiff percentage of sliders. It seems like sinker curveball and cutter suffer from rise.

tail_whiffs_plot <- ggplot(data = statcast_move_aggregate,
                           mapping = aes(x = tail,
                                         y = whiff_percentage)) +
  geom_point(aes(color = pitch)) +
  geom_smooth(method = "lm",
              se = FALSE,
              fullrange = TRUE,
              color = "black") +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  xlim(-250, 250) +
  facet_wrap(vars(pitch)) +
  labs(title = "Influence of Tail on Whiff Likelihood",
       x = "Tail (in)",
       y = "Whiffs (%)",
       color = "Pitch")

tail_whiffs_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 29 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 29 rows containing missing values (`geom_point()`).

Location

I need to pull location data for all pitches

slider_location_all_aggregate <- slider_location_all_aggregate %>%
  mutate(whiff_percentage = whiffs / swings) %>% 
  mutate(zone = recode(zone, 
                       '01' = 'Top Left', '02' = 'Top Center', '03' = 'Top Right',
                       '04' = 'Middle Left', '05' = 'Middle Center', '06' = 'Middle Right',
                       '07' = 'Bottom Left', '08' = 'Bottom Center', '09' = 'Bottom Right',
                       '10' = 'Outside Top Left', '12' = 'Outside Top Right',
                       '13' = 'Outside Bottom Left', '14' = 'Outside Bottom Right'))

slider_zone_plot <- ggplot(data = slider_location_all_aggregate,
                           mapping = aes(x = factor(zone,
                                                    level = c("Outside Top Left", "Outside Top Right",
                                                              "Top Left", "Top Center", "Top Right",
                                                              "Middle Left", "Middle Center", "Middle Right",
                                                              "Bottom Left", "Bottom Center", "Bottom Right",
                                                              "Outside Bottom Left", "Outside Bottom Right")),
                                         y = whiff_percentage)) +
  geom_point(alpha = 0.1) +
  geom_jitter(alpha = 0.1) +
  stat_summary(geom = "point",fun.y = "mean",
               col = "red", fill = "red",
               size = 5, shape = 19, alpha = 1) +
  stat_summary(fun.data = mean_se, geom = "errorbar",
               col = "red", fill = "red", alpha = 1) +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(title = "Influence of Pitch Location on Whiff Likelihood",
       x = "Location Zone",
       y = "Whiffs (%)",
       color = "Zone")
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## Warning in stat_summary(fun.data = mean_se, geom = "errorbar", col = "red", :
## Ignoring unknown parameters: `fill`
slider_zone_plot
## Warning: Removed 5470 rows containing non-finite values (`stat_summary()`).
## Warning: Removed 5470 rows containing non-finite values (`stat_summary()`).
## Warning: Removed 5470 rows containing missing values (`geom_point()`).
## Removed 5470 rows containing missing values (`geom_point()`).

There isn’t much difference between pitches in the top and bottom of the zone or outside top. Whiffs are very infrequent in the middle. Whiffs are much higher when outside and below the zone.

Let’s see if there’s any difference by year

slider_zone_year_plot <- slider_zone_plot + 
facet_wrap(vars(year))

slider_zone_year_plot
## Warning: Removed 5470 rows containing non-finite values (`stat_summary()`).
## Removed 5470 rows containing non-finite values (`stat_summary()`).
## Warning: Removed 5470 rows containing missing values (`geom_point()`).
## Removed 5470 rows containing missing values (`geom_point()`).

Nope. The trend is pretty much the same.

Let’s make one condensed figure that highlights all the IVs that influence whiff percentage for sliders.

statcast_slider_move_df <- statcast_move_aggregate %>% 
  filter(pitch == "Slider")

slider_velo_plot <- ggplot(data = statcast_slider_move_df,
                               mapping = aes(x = velocity,
                                             y = whiff_percentage)) +
  geom_point(aes(color = "magenta")) +
  geom_smooth(method = "lm",
              se = FALSE,
              fullrange = TRUE,
              color = "black") +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(title = "Influence of Pitch Velocity on Whiff Likelihood",
       x = "Velocity (mph)",
       y = "Whiffs (%)")

slider_spin_plot <- ggplot(data = statcast_slider_move_df,
                               mapping = aes(x = spin_rate,
                                             y = whiff_percentage)) +
  geom_point(aes(color = "magenta")) +
  geom_smooth(method = "lm",
              se = FALSE,
              fullrange = TRUE,
              color = "black") +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(title = "Influence of Spin Rate on Whiff Likelihood",
       x = "Spin Rate (rpm)",
       y = "Whiffs (%)")

slider_release_plot <- ggplot(data = statcast_slider_move_df,
                              mapping = aes(x = release_extension,
                                            y = whiff_percentage)) +
  geom_point(aes(color = "magenta")) +
  geom_smooth(method = "lm",
              se = FALSE,
              fullrange = TRUE,
              color = "black") +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(title = "Influence of Release Extension on Whiff Likelihood",
       x = "Release Extension (ft)",
       y = "Whiffs (%)")

slider_horizontal_plot <- ggplot(data = statcast_slider_move_df,
                                 mapping = aes(x = horizontal_break,
                                               y = whiff_percentage)) +
  geom_point(aes(color = "magenta")) +
  geom_smooth(method = "lm",
              se = FALSE,
              fullrange = TRUE,
              color = "black") +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(title = "Influence of Hor. Movement on Whiff Likelihood",
       x = "Horizontal Break (in)",
       y = "Whiffs (%)")

slider_vertical_plot <- ggplot(data = statcast_slider_move_df,
                               mapping = aes(x = vertical_break,
                                             y = whiff_percentage)) +
  geom_point(aes(color = "magenta")) +
  geom_smooth(method = "lm",
              se = FALSE,
              fullrange = TRUE,
              color = "black") +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(title = "Influence of Ver. Movement on Whiff Likelihood",
       x = "Vertical Break (in)",
       y = "Whiffs (%)")

slider_tail_plot <- ggplot(data = statcast_slider_move_df,
                           mapping = aes(x = tail,
                                         y = whiff_percentage)) +
  geom_point(aes(color = "magenta")) +
  geom_smooth(method = "lm",
              se = FALSE,
              fullrange = TRUE,
              color = "black") +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(title = "Influence of Tail on Whiff Likelihood",
       x = "Horizontal Break (in)",
       y = "Whiffs (%)")

slider_rise_plot <- ggplot(data = statcast_slider_move_df,
                           mapping = aes(x = rise,
                                         y = whiff_percentage)) +
  geom_point(aes(color = "magenta")) +
  geom_smooth(method = "lm",
              se = FALSE,
              fullrange = TRUE,
              color = "black") +
  theme_minimal() +
  theme(legend.box.background = element_rect(colour = "black"),
        legend.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(title = "Influence of Rise on Whiff Likelihood",
       x = "Horizontal Break (in)",
       y = "Whiffs (%)")

slider_factor_plot <- ggarrange(slider_velo_plot, slider_spin_plot, slider_release_plot,
                                slider_horizontal_plot, slider_vertical_plot,
                                slider_rise_plot, slider_tail_plot,
                                ncol = 3,
                                nrow = 3)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
slider_factor_plot

Modeling

OK. So pitch factors like velocity, spin rate, release extension, movement, and location all seem to contribute to whether or not a batter misses when they swing. Let’s model this.

Model Data Aggregation

I need to do a final data aggregation - and this one will be a little complicated. I need to merge the movement data onto the pitch location data aggregate that I have. I do not have pitch movement data that is location-specific, so I will need to use the pitch movement that I have been using and apply it to all the locations. This is a clear limitation of my analysis but will provide more insight than not including it.

I am also going to filter out any samples with fewer than 9 swings per zone. These sample sizes are very small compared to some who threw 100+ in a given zone.

slider_move_aggregate <- all_move_aggregate %>% 
  filter(pitch == 'Slider') %>% # keep only slider
  group_by(player_id, year)

slider_move_location_stats_aggregate <- slider_location_all_aggregate %>% 
  group_by(player_id, year, zone) %>% 
  inner_join(slider_move_aggregate,
             by = c('player_id' = 'player_id',
                    'year' = 'year',
                    'pitch' = 'pitch')) %>% 
  select(-c(last_name, first_name, 
            total_pitches.y, pitches_thrown, avg_speed)) %>% 
  rename(total_pitches = total_pitches.x) %>% 
  select(player_id, player_name, pitch_hand, pitch, year, zone,
         total_pitches, rise:vertical_break, spin_rate:whiff_percentage) %>% 
  filter(swings > 9) %>% # remove tiny sample sizes
  mutate(rise = as.numeric(rise),
         tail = as.numeric(tail),
         spin_rate = as.numeric(spin_rate))

slider_move_location_stats_select <- slider_move_location_stats_aggregate %>% 
  select(player_id, year, whiff_percentage, 
         velocity, spin_rate, release_extension, horizontal_break, vertical_break,
         rise, tail, zone, whiffs, swings) %>% 
  drop_na()

So the data here is going to be slightly different than the data previously used in the location analyses and the movement analyses - because this new dataset is merged by all the common player_id and year entries.

Now let’s fit a model with all of the IVs that I’ve identified. Since whiffs is count data, I’ll fit a glm with a Poisson distribution. The standard link for a Poisson is log.

slider_whiff_model <- lm(whiff_percentage ~ velocity + spin_rate + 
                         release_extension + horizontal_break + vertical_break 
                         + rise + tail + zone,
                         data = slider_move_location_stats_aggregate)

library(performance)

check_model(slider_whiff_model)

slider_whiff_glm <- glm(whiffs ~ velocity + spin_rate + 
                        release_extension + horizontal_break + vertical_break 
                        + rise + tail + zone + offset(log(swings)),
                        family = quasipoisson(link = "log"),
                        data = slider_move_location_stats_select)

check_overdispersion(slider_whiff_glm)
## # Overdispersion test
## 
##        dispersion ratio =    1.168
##   Pearson's Chi-Squared = 9832.550
##                 p-value =  < 0.001
## Overdispersion detected.
check_model(slider_whiff_glm)

slider_move_location_stats_aggregate %>%
  as.data.frame() %>%  
  mutate(zone = as.numeric(zone),
         spin_rate = as.numeric(spin_rate),
         release_extension = as.numeric(release_extension)) %>% 
  select(velocity, spin_rate, release_extension,
         horizontal_break, vertical_break, rise, tail, zone) %>%                   # select the variables
  as.matrix() %>%                   
  cor()
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
##                      velocity  spin_rate release_extension horizontal_break
## velocity           1.00000000 -0.0405145        0.07793343      -0.55914569
## spin_rate         -0.04051450  1.0000000       -0.14056713       0.45530184
## release_extension  0.07793343 -0.1405671        1.00000000       0.02718865
## horizontal_break  -0.55914569  0.4553018        0.02718865       1.00000000
## vertical_break    -0.74540034  0.2493097       -0.15969172       0.53144721
## rise              -0.10041640  0.3200381       -0.03399798       0.21361197
## tail              -0.10153489  0.5056075        0.01321109       0.73626409
## zone                       NA         NA                NA               NA
##                   vertical_break        rise        tail zone
## velocity              -0.7454003 -0.10041640 -0.10153489   NA
## spin_rate              0.2493097  0.32003812  0.50560752   NA
## release_extension     -0.1596917 -0.03399798  0.01321109   NA
## horizontal_break       0.5314472  0.21361197  0.73626409   NA
## vertical_break         1.0000000  0.68484811  0.24464024   NA
## rise                   0.6848481  1.00000000  0.27720658   NA
## tail                   0.2446402  0.27720658  1.00000000   NA
## zone                          NA          NA          NA    1

It’s pretty clear there is no collinearity between any of the IVs. I’m comfortable with my model going forward.

Now let’s analyze the model.

summary(slider_whiff_model)
## 
## Call:
## lm(formula = whiff_percentage ~ velocity + spin_rate + release_extension + 
##     horizontal_break + vertical_break + rise + tail + zone, data = slider_move_location_stats_aggregate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.54794 -0.08086 -0.00563  0.07944  0.59269 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -1.827e-01  1.601e-01  -1.141  0.25387    
## velocity                  3.752e-03  1.383e-03   2.713  0.00669 ** 
## spin_rate                 1.544e-05  7.691e-06   2.008  0.04464 *  
## release_extension         3.463e-03  3.182e-03   1.088  0.27653    
## horizontal_break          1.295e-03  7.071e-04   1.831  0.06715 .  
## vertical_break            5.094e-04  1.102e-03   0.462  0.64387    
## rise                      3.763e-04  4.482e-04   0.840  0.40113    
## tail                     -1.156e-04  4.149e-05  -2.788  0.00532 ** 
## zoneBottom Left           3.712e-02  6.613e-03   5.613 2.05e-08 ***
## zoneBottom Right          8.714e-02  5.349e-03  16.291  < 2e-16 ***
## zoneMiddle Center        -8.785e-02  5.340e-03 -16.451  < 2e-16 ***
## zoneMiddle Left          -7.861e-02  6.414e-03 -12.256  < 2e-16 ***
## zoneMiddle Right         -4.528e-02  5.618e-03  -8.060 8.69e-16 ***
## zoneOutside Bottom Left   3.352e-01  6.274e-03  53.428  < 2e-16 ***
## zoneOutside Bottom Right  3.988e-01  5.155e-03  77.359  < 2e-16 ***
## zoneOutside Top Left      1.014e-01  1.753e-02   5.784 7.56e-09 ***
## zoneOutside Top Right     7.805e-02  1.010e-02   7.730 1.20e-14 ***
## zoneTop Center            1.197e-02  8.097e-03   1.478  0.13932    
## zoneTop Left              3.355e-02  1.263e-02   2.657  0.00790 ** 
## zoneTop Right             5.168e-03  1.019e-02   0.507  0.61215    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1277 on 8417 degrees of freedom
## Multiple R-squared:  0.6533, Adjusted R-squared:  0.6525 
## F-statistic: 834.8 on 19 and 8417 DF,  p-value: < 2.2e-16
r2(slider_whiff_model)
## # R2 for Linear Regression
##        R2: 0.653
##   adj. R2: 0.653
r2(slider_whiff_glm)
## # R2 for Generalized Linear Regression
##   Nagelkerke's R2: 0.970

The only locations that increase whiff likelihood are outside bottom right, outside bottom left, and outside top left. Basically, don’t throw pitches in the zone. But make them look like they’re going to be in the zone.

All pitch factors except tail increase whiff percentage. Vertical break has the largest effect, followed by velocity, release extension, and rise.

Let’s visualize this.

I’m going to start by looking at zone.

library(visreg) # for visualizing model
library(ggplot2) # for plotting features
library(ggpubr) # for combining plotting objects

zone_visreg <- visreg(slider_whiff_model,
                      "zone",
                      gg = TRUE,
                      line.par = list(col = "red",
                                      alpha = 1)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(title = "Influence of Pitch Location",
       subtitle = "On Whiff Likelihood",
       x = "Pitch Location",
       y = "Whiff Percentage")

zone_visreg

Throwing a slider in the middle of the plate is bad. Throwing it low and outside the zone is best - particularly to the right. This is likely because most pitchers and batters are righties so the slider breaks down and away from the batter in this zone.

library(visreg)
library(ggforce)

velo_visreg_byzone <- visreg(slider_whiff_glm,
                             "velocity",
                             by = "zone",
                             gg = TRUE,
                             partial = FALSE,
                             rug = FALSE,
                             line.par = list(col = "black",
                                             alpha = 1)) +
  geom_point(aes(color = zone),
             size = 1, alpha = 0.1) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 6),
        legend.position = "none") +
  facet_grid_paginate(~factor(zone),
                      ncol = 3) +
  labs(title = "Influence of Velocity on Whiffs",
       x = "Velocity (mph)",
       y = "Whiffs",
       color = "Pitch Location")

velo_visreg_byzone

Uptrend in whiff percentage as velocity increases.

spin_visreg_byzone <- visreg(slider_whiff_glm,
                             "spin_rate",
                             by = "zone",
                             gg = TRUE,
                             partial = FALSE,
                             rug = FALSE,
                             line.par = list(col = "black",
                                             alpha = 1)) +
  geom_point(aes(color = zone),
             size = 1, alpha = 0.1) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 6),
        legend.position = "none") +
  facet_grid_paginate(~factor(zone),
                      ncol = 3) +
  labs(title = "Influence of Spin Rate on Whiffs",
       x = "Spin Rate (rpm)",
       y = "Whiffs",
       color = "Pitch Location")

spin_visreg_byzone

release_visreg_byzone <- visreg(slider_whiff_glm,
                      "release_extension",
                      by = "zone",
                      gg = TRUE,
                      partial = FALSE,
                      rug = FALSE,
                      line.par = list(col = "black",
                                      alpha = 1)) +
  geom_point(aes(color = zone),
             size = 1, alpha = 0.1) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 6),
        legend.position = "none") +
  facet_grid_paginate(~factor(zone),
                      ncol = 3) +
  labs(title = "Influence of Release Extension on Whiffs",
       x = "Release Extension (ft)",
       y = "Whiffs",
       color = "Pitch Location")

release_visreg_byzone

horizontal_break_visreg_byzone <- visreg(slider_whiff_glm,
                                         "horizontal_break",
                                         by = "zone",
                                         gg = TRUE,
                                         partial = FALSE,
                                         rug = FALSE,
                                         line.par = list(col = "black",
                                                         alpha = 1)) +
  geom_point(aes(color = zone),
             size = 1, alpha = 0.1) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 6),
        legend.position = "none") +
  facet_grid_paginate(~factor(zone),
                      ncol = 3) +
  labs(title = "Influence of Horizontal Break on Whiffs",
       x = "Horizontal Break (in)",
       y = "Whiffs",
       color = "Pitch Location")

horizontal_break_visreg_byzone

vertical_break_visreg_byzone <- visreg(slider_whiff_glm,
                                       "vertical_break",
                                        by = "zone",
                                        gg = TRUE,
                                        partial = FALSE,
                                        rug = FALSE,
                                        line.par = list(col = "black",
                                                        alpha = 1)) +
  geom_point(aes(color = zone),
             size = 1, alpha = 0.1) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 6),
        legend.position = "none") +
  facet_grid_paginate(~factor(zone),
                      ncol = 3) +
  labs(title = "Influence of Vertical Break on Whiffs",
       x = "Vertical Break (in)",
       y = "Whiffs",
       color = "Pitch Location")
vertical_break_visreg_byzone

library(visreg)
library(ggplot2)
library(ggforce)

tail_break_visreg_byzone <- visreg(slider_whiff_glm,
                                   "tail",
                                    by = "zone",
                                    gg = TRUE,
                                    partial = FALSE,
                                    rug = FALSE,
                                    line.par = list(col = "black",
                                                    alpha = 1)) +
  geom_point(aes(color = zone),
             size = 1, alpha = 0.1) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 6),
        legend.position = "none") +
  facet_grid_paginate(~factor(zone),
                      ncol = 3) +
  labs(title = "Influence of Pitch Tail on Whiffs",
       x = "Tail (in)",
       y = "Whiffs",
       color = "Pitch Location")

tail_break_visreg_byzone

rise_break_visreg_byzone <- visreg(slider_whiff_glm,
                                   "rise",
                                   by = "zone",
                                   gg = TRUE,
                                   partial = FALSE,
                                   rug = FALSE,
                                   line.par = list(col = "black",
                                                   alpha = 1)) +
  geom_point(aes(color = zone),
             size = 1, alpha = 0.1) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 6),
        legend.position = "none") +
  facet_grid_paginate(~factor(zone),
                      ncol = 3) +
  labs(title = "Influence of Pitch Rise on Whiffs",
       x = "Rise (in)",
       y = "Whiffs",
       color = "Pitch Location")

rise_break_visreg_byzone